library(dplyr)
library(leaflet)
library(geosphere)
library(data.table)
library(ggplot2)
library(lubridate)
library(caret)
library(readxl)
library(tidyverse)
library(ggthemes)
library(ggrepel)
library(reshape)
library(reshape2)
library(devtools)

Motivation:

Bikesharing has become more and more popular in major cities in the United States. Bluebikes is Metro Boston's public bike share program, with over 1,800 bikes and more than 200 stations across Boston, Brookline, Cambridge, and Somerville. Bluebike operates under the customer and subscriber systems. In 2017, the total number of trips was 1,313,837, and the number of subscribers was 14,577. For an individual customer, a single trip pass costs $2.50 for a 30-minute ride, with $2.50 for each additional 30 minutes. For a subscriber, Bluebike requires a membership fee of $99 per year; the subscriber then has unlimited rides, with each ride limited to 45 minutes. For each additional 30 minutes beyond the 45-minute ride, the subscriber pays $2.50.

Objectives:

For this project, we would like to understand

  • What are the general demographics of bike riders?
  • How does the frequency of bike riding change with time, day and month of the year?
  • Does weather affect bike riding?
  • Given Bluebike only collects information on subscribers, how can we predict the characteristics and conditions of overtime users?

Data preparation

Approach

Starting with web scraping, we collected datasets from Boston Hubway and weather sites, focusing on the year 2017. We merged these datasets into a single csv file for our subsequent analysis. We performed exploratory analysis of Bluebike riders in Boston, with the goal of getting a general idea of who a typical Bluebike subscriber was and how their riding patterns changed with the weather. We then drilled down on Bluebike subscribers who ride overtime, meaning more than 45 minutes, since these riders pay an extra $2.50 per 30 minutes and thus generate extra revenue for the company. This informed our subsequent analysis, using multiple logistic regression techniques and decision tree analysis, in which we tried to predict the typical overtime subscriber and what day of the week such a rider would be out and about.

Read the bluebike data in 2017

# Import all twelve monthly Hubway trip files for 2017 and combine them.
# File names follow the pattern "YYYYMM-hubway-tripdata.csv"; generating
# them avoids twelve copy-pasted read.csv calls.
monthly_files <- sprintf("2017%02d-hubway-tripdata.csv", seq_len(12))
monthly_data <- lapply(monthly_files, read.csv, stringsAsFactors = FALSE)

# Stack the monthly data frames into one year-long data frame
dat_2017 <- do.call(rbind, monthly_data)

Add variables

## Derive rider/trip features: age, age_cat (decade bins), duration_min,
## and calendar fields (year, month, month_abb, day, hour, wday).
## Note: the original used `.$age` inside case_when, which relies on the
## magrittr pipe placeholder; referring to the column directly is the
## supported dplyr idiom and evaluates to the same values.
bbike <- dat_2017 %>%
  mutate(
    birth.year = as.numeric(birth.year),  # "\\N" entries coerce to NA
    age = 2017 - birth.year,
    # Decade bins: 1 = 10-19, 2 = 20-29, ..., 8 = 80+; unmatched ages stay NA
    age_cat = case_when(
      age >= 10 & age < 20 ~ 1,
      age >= 20 & age < 30 ~ 2,
      age >= 30 & age < 40 ~ 3,
      age >= 40 & age < 50 ~ 4,
      age >= 50 & age < 60 ~ 5,
      age >= 60 & age < 70 ~ 6,
      age >= 70 & age < 80 ~ 7,
      age >= 80 ~ 8
    ),
    duration_min = tripduration / 60,  # tripduration appears to be seconds
    year = year(starttime),
    month = month(starttime),
    month_abb = month(starttime, label = TRUE, abbr = TRUE),
    day = day(starttime),
    hour = hour(starttime),
    wday = wday(starttime, label = TRUE, abbr = TRUE)
  )
## Trip distance (km): geodesic distance between start and end stations
setDT(bbike)[, dist_km := distGeo(
  matrix(c(start.station.longitude, start.station.latitude), ncol = 2),
  matrix(c(end.station.longitude, end.station.latitude), ncol = 2)
) / 1000]
bbike <- as.data.frame(bbike)
## overtime: 1 if the trip exceeded the 45-minute subscriber limit, else 0.
## user_start / user_end: total number of trips starting / ending at the
## trip's start / end station (a station-popularity proxy).
## FIX: the original left bbike grouped by end.station.id; ungroup after
## each grouped mutate so later operations see an ungrouped data frame.
bbike <- bbike %>%
  mutate(overtime = ifelse(duration_min > 45, 1, 0)) %>%
  group_by(start.station.id) %>%
  mutate(user_start = n()) %>%
  ungroup() %>%
  group_by(end.station.id) %>%
  mutate(user_end = n()) %>%
  ungroup()

Combine weather information

## temp_max, temp_min, rain, snownice (snow or ice)
## Join daily weather onto each trip by calendar date. A left join keeps
## every trip even when a date has no weather row.
## FIX: the original called group_by() before the join; grouping plays no
## role in a join's keys and only left bbike needlessly grouped.
weather <- read_excel("boston_weather.xls")
bbike <- left_join(bbike, weather, by = c("year", "month", "day"))

Motivation

The number of Bluebike users has increased over the past several years. Subscribers who pay $99 per year get unlimited rides, but each ride is limited to 45 minutes. We are going to figure out when riders do not return their bike within 45 minutes and predict the pattern.

Membership and Ridership More than 8 million trips have been taken by Bluebikes riders since the 2011 launch (as of 12/2018) An estimated 87,000 unique riders took trips in 2016

# Annual rider counts, stacked by user type (subscriber vs customer)
bbike_summary <- as.data.frame(read_excel("bluebike_summary.xlsx"))
plot <- bbike_summary %>%
  melt(id.vars = "year") %>%
  filter(variable %in% c("subscriber", "customer"))
plot %>%
  ggplot(aes(x = year, y = value, fill = variable)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = c("#FF8F1C", "#0050B5")) +
  scale_x_continuous(breaks = c(2011, 2012, 2013, 2014, 2015, 2016, 2017)) +
  xlab("Year") +
  ylab("Riders") +
  ggtitle("Number of Riders since 2011") +
  theme_bw()

# Total trips per year since the 2011 launch
bbike_summary %>%
  ggplot(aes(x = year, y = total_trips)) +
  geom_point(size = 2, color = "#FF8F1C") +
  geom_line(alpha = 0.7, color = "#1D428A") +
  xlab("Year") +
  ylab("Numbers") +
  ggtitle("Total Number of Trips since 2011") +
  theme_bw()

# Trips by age decade, stacked by gender (1 = male, 2 = female, 0 = unknown;
# unknown riders are excluded)
bbike %>%
  mutate(gender = as.factor(gender)) %>%
  filter(gender != "0") %>%
  ggplot(aes(x = age_cat, fill = gender)) +
  geom_bar() +
  scale_fill_brewer(palette = "Oranges") +
  xlab("Age category (10s, 20s, 30s etc)") +
  ylab("Number of rides") +
  ggtitle("Total Number of Trips by gender by age") +
  theme_bw()

Blue bike Stations

# Station-level summary: ride count and coordinates for each start station.
# Riders with missing ("\\N") or implausible birth years are dropped, as are
# stations with invalid (non-positive) latitudes.
dat_station <- read.csv("Hubway_Stations_as_of_July_2017.csv")
dat_2017_station <- bbike %>%
  filter(birth.year != "\\N") %>%
  filter(birth.year > 1900 & birth.year <= 2017) %>%
  group_by(start.station.id) %>%
  summarize(
    number = n(),
    start.station.latitude = first(start.station.latitude),
    start.station.longitude = first(start.station.longitude),
    start.station.name = first(start.station.name)
  ) %>%
  filter(start.station.latitude > 0)
summary(dat_2017_station)
##  start.station.id     number      start.station.latitude
##  Min.   :  1.00   Min.   :    4   Min.   :42.30         
##  1st Qu.: 54.75   1st Qu.: 1794   1st Qu.:42.34         
##  Median :108.50   Median : 4750   Median :42.36         
##  Mean   :111.91   Mean   : 5625   Mean   :42.36         
##  3rd Qu.:173.25   3rd Qu.: 7614   3rd Qu.:42.37         
##  Max.   :232.00   Max.   :35702   Max.   :42.41         
##  start.station.longitude start.station.name
##  Min.   :-71.17          Length:196        
##  1st Qu.:-71.11          Class :character  
##  Median :-71.08          Mode  :character  
##  Mean   :-71.09                            
##  3rd Qu.:-71.06                            
##  Max.   :-71.01

Map about number of users at each station

# Map each station's ride count to a marker color:
#   1-4000 -> green, 4001-10000 -> orange, > 10000 -> red,
#   anything else (e.g. 0) -> blue.
# FIXES: `number %in% 1:4000` materialized a 4000-element vector per station
# (O(stations x range)); replaced with range comparisons. sapply() is
# type-unstable on empty input; vapply() pins the result to character.
# Behavior is identical for the integer counts produced by n().
getColor <- function(df){
  vapply(df$number, function(number) {
    if (number >= 1 && number <= 4000) {
      "green"
    } else if (number >= 4001 && number <= 10000) {
      "orange"
    } else if (number > 10000) {
      "red"
    } else {
      "blue"
    }
  }, character(1))
}

# Marker icons whose color encodes each station's ride volume
icons <- awesomeIcons(
  icon = "ios-close",
  iconColor = "black",
  library = "ion",
  markerColor = getColor(dat_2017_station)
)

# Interactive map: one marker per start station, labeled with its ride
# count and a popup showing the station name
leaflet(dat_2017_station) %>%
  addTiles() %>%
  addAwesomeMarkers(
    ~start.station.longitude,
    ~start.station.latitude,
    icon = icons,
    label = ~as.character(number),
    popup = ~start.station.name
  )

Hourly

# Rides by hour of day, faceted by weekday
bbike %>%
  ggplot(aes(x = hour)) +
  geom_bar() +
  scale_x_continuous(breaks = c(0, 4, 8, 12, 16, 20)) +
  xlab("Hour") +
  ylab("Number of users") +
  facet_wrap(~ wday)

# Rides by hour of day, colored by gender and faceted by month
bbike %>%
  ggplot(aes(x = hour, fill = factor(gender))) +
  geom_bar() +
  scale_x_continuous(breaks = c(0, 4, 8, 12, 16, 20)) +
  xlab("Hour") +
  ylab("Number of users") +
  facet_wrap(~ month) +
  scale_fill_discrete("Gender", labels = c("Unknown", "Male", "Female"))

Distribution of Trip Duration

# Subscribers only; inspect the raw trip-duration distribution (minutes)
bbike_member <- filter(bbike, usertype == "Subscriber")
summary(bbike_member$duration_min)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1.02     6.10     9.78    13.31    15.48 61276.80

As you can see, the data seem to contain erroneous records: the very long trip durations are likely attributable to lost bikes or other errors. Therefore, we restrict trip durations in this study — to under 50 minutes for the distribution plots below, and to under 75 minutes for the later modeling.

# Drop implausibly long trips, then compare the duration distributions
# on rainy vs dry days (counts, not densities, on the y-axis)
bbike_member <- filter(bbike_member, duration_min < 50)
bbike_member %>%
  mutate(group = ifelse(rain == 0, "no rain", "rain")) %>%
  ggplot(aes(x = duration_min, y = ..count.., fill = group)) +
  geom_density(alpha = 0.2) +
  xlab("Trip Duration (min)") +
  ylab("Riders")

# Trip-duration distribution by gender
bbike_member %>%
  mutate(gender = as.factor(gender)) %>%
  ggplot(aes(x = duration_min, y = ..count.., fill = gender)) +
  geom_density(alpha = 0.2) +
  xlab("Trip Duration (min)") +
  ylab("Riders")

Trip distance

# Mean trip distance by age decade, with +/- 2 SE error bars
bbike %>%
  filter(birth.year > 1900 & birth.year <= 2017) %>%
  group_by(age_cat) %>%
  summarize(avg = mean(dist_km), se = sd(dist_km) / sqrt(n())) %>%
  ggplot(aes(x = age_cat, y = avg)) +
  geom_errorbar(aes(ymin = avg - 2 * se, ymax = avg + 2 * se), width = 0.2) +
  geom_point(color = "#FF8F1C") +
  geom_line(color = "#1D428A") +
  scale_x_continuous(
    breaks = c(1, 2, 3, 4, 5, 6, 7, 8),
    labels = c("10-20", "20-30", "30-40", "40-50",
               "50-60", "60-70", "70-80", "80-")
  ) +
  xlab(expression(paste(Age, " (years)"))) +
  ylab(expression(paste(Distance, " (km)"))) +
  theme_bw()

There is no consistent trend between age and trip distance.

Machine Learning - using logistic regression to predict overtime users characteristics

Rationale: With an annual membership, users can ride unlimited bike ride for up to 45 mins. Additional $2.5 will be charged per 30 mins for members. In this analysis, we are interested in learning about the characteristics of overtime riders so that Bluebike can gain extra revenue.

Only keep members in the dataset

## Build the subscriber-only modeling dataset: trips under 75 minutes, with
## categorical gender/day/hour, binary weather flags, and a weekend flag.
bbikemember <- bbike %>%
        ungroup() %>%
  filter(usertype == "Subscriber" & duration_min < 75) %>%
        mutate(gender = as.factor(gender), day = as.factor(day),
               hour = as.factor(hour)) %>%
        # Collapse rain / snow-ice amounts to binary indicators (any vs none)
        mutate(rain_cat = ifelse(rain == 0, 0, 1)) %>%
        mutate(snownice_cat = ifelse(snownice == 0, 0, 1)) %>%
        # NOTE(review): 'overtime' is redefined here with a 15-minute cutoff,
        # which conflicts with the 45-minute overtime definition stated in the
        # text and used earlier (duration_min > 45). Confirm whether 15 is
        # intentional (e.g. to rebalance classes) before interpreting the
        # models below as predicting true overtime rides.
        mutate(overtime = ifelse(duration_min < 15, 0, 1)) %>%
        mutate(overtime = as.factor(overtime)) %>%
        # satsun = 1 for Saturday/Sunday trips, 0 for weekdays
        mutate(satsun = ifelse(wday %in% c("Sat", "Sun"), 1, 0)) %>%
        mutate(satsun = as.factor(satsun))

logistic regression

set.seed(1)

library(caret)

# 50/50 train/test split, stratified on the outcome
Train <- createDataPartition(bbikemember$overtime, p = 0.5, list = FALSE)
training <- bbikemember[Train, ]
testing <- bbikemember[-Train, ]

bbikemember$overtime <- as.factor(as.numeric(as.character(bbikemember$overtime)))

# Baseline accuracy: guess classes at random with the observed class rate.
# FIX: the original drew length(testing) guesses — length() of a data frame
# is its number of COLUMNS — which produced the "longer object length is not
# a multiple" warnings; nrow(testing) gives one guess per test row.
p <- 0.358  # 290235/810623, observed proportion of overtime rides
y_hat <- sample(c("0", "1"), nrow(testing), replace = TRUE, prob = c(p, 1 - p)) %>%
  factor(levels = levels(bbikemember$overtime))
mean(y_hat == testing$overtime)
## Warning in `==.default`(y_hat, testing$overtime): longer object length is
## not a multiple of shorter object length
## Warning in is.na(e1) | is.na(e2): longer object length is not a multiple of
## shorter object length
## [1] 0.4254289
#logistic regression
# Fit a logistic regression for the overtime flag on rider demographics,
# calendar, weather, and station-popularity features, using the training
# half only.
# NOTE(review): the name glm.fit shadows stats::glm.fit(); it works here
# because glm() is called by name, but consider renaming (also used at the
# prediction step below).
glm.fit <- training %>% 
  glm(overtime ~ gender + age + month + satsun + rain_cat +
               snownice_cat + user_start + user_end, data=., family = "binomial")

summary(glm.fit)
## 
## Call:
## glm(formula = overtime ~ gender + age + month + satsun + rain_cat + 
##     snownice_cat + user_start + user_end, family = "binomial", 
##     data = .)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5008  -0.8277  -0.7032   1.3467   2.6214  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   6.371e-02  4.494e-02   1.418    0.156    
## gender1      -7.974e-01  4.228e-02 -18.860  < 2e-16 ***
## gender2      -4.754e-01  4.254e-02 -11.175  < 2e-16 ***
## age           6.794e-03  2.698e-04  25.182  < 2e-16 ***
## month         6.075e-03  1.242e-03   4.891 1.01e-06 ***
## satsun1       2.427e-01  7.845e-03  30.933  < 2e-16 ***
## rain_cat     -9.405e-02  6.836e-03 -13.758  < 2e-16 ***
## snownice_cat -6.017e-01  4.122e-02 -14.597  < 2e-16 ***
## user_start   -2.917e-05  4.176e-07 -69.851  < 2e-16 ***
## user_end     -2.679e-05  3.878e-07 -69.086  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 633720  on 549482  degrees of freedom
## Residual deviance: 615384  on 549473  degrees of freedom
##   (947 observations deleted due to missingness)
## AIC: 615404
## 
## Number of Fisher Scoring iterations: 4
#prediction
# Predicted overtime probabilities on the held-out test set; classify
# with a 0.5 threshold.
p_hat <- predict(glm.fit, newdata=testing, type="response")
y_hat <- ifelse(p_hat > 0.5, 1, 0) %>% factor()

#confusion matrix 
table(predicted = y_hat, actual = testing$overtime)
##          actual
## predicted      0      1
##         0 404103 144268
##         1    601    518
# Accuracy barely beats the no-information rate: the model almost always
# predicts class 0 (specificity ~0.004), so it has little discriminative
# power at the 0.5 threshold.
confusionMatrix(data = factor(y_hat), reference = factor(testing$overtime))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction      0      1
##          0 404103 144268
##          1    601    518
##                                           
##                Accuracy : 0.7364          
##                  95% CI : (0.7352, 0.7375)
##     No Information Rate : 0.7365          
##     P-Value [Acc > NIR] : 0.601           
##                                           
##                   Kappa : 0.0031          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.998515        
##             Specificity : 0.003578        
##          Pos Pred Value : 0.736915        
##          Neg Pred Value : 0.462913        
##              Prevalence : 0.736508        
##          Detection Rate : 0.735415        
##    Detection Prevalence : 0.997964        
##       Balanced Accuracy : 0.501046        
##                                           
##        'Positive' Class : 0               
## 

ROC

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Reuse the logistic regression already fit on the training set above;
# the original refit an identical model (same formula, data, and family),
# which was redundant and slow on ~550k rows.
model <- glm.fit
#First example
# ROC curve for predicted probabilities on the test set
predpr <- predict(model, newdata = testing, type = "response")
roccurve <- roc(testing$overtime ~ predpr)
plot(roccurve)

roccurve
## 
## Call:
## roc.formula(formula = testing$overtime ~ predpr)
## 
## Data: predpr in 404704 controls (testing$overtime 0) < 144786 cases (testing$overtime 1).
## Area under the curve: 0.6135
# Second example
# FIX (hygiene): the original stored the predictions in a variable named
# `predict`, shadowing stats::predict(); renamed to pred_response.
pred_response <- predict(model, newdata = testing, type = "response")
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# TPR-vs-FPR performance curve via ROCR
ROCRpred <- prediction(pred_response, testing$overtime)
ROCRperf <- performance(ROCRpred, 'tpr','fpr')
plot(ROCRperf, colorize = FALSE, text.adj = c(-0.2, 1.7))

library(purrr)
library(caret)
library(ggplot2)

# Guessing baseline ROC: for each guess probability p, sample random
# predictions and compute FPR/TPR against the held-out test outcomes.
# FIXES: (1) length(testing) is the data frame's column count, not its row
# count — use nrow(testing) so y_hat has one entry per test row;
# (2) the reference must be testing$overtime (same length as y_hat), not
# bbikemember$overtime, which caused the recycling warnings below.
probs <- seq(0, 1, length.out = 10)
guessing <- map_df(probs, function(p){
  y_hat <-
    sample(c("0", "1"), nrow(testing), replace = TRUE, prob = c(p, 1 - p)) %>%
    factor(levels = levels(bbikemember$overtime))
  list(method = "Guessing",
       FPR = 1 - specificity(y_hat, testing$overtime),
       TPR = sensitivity(y_hat, testing$overtime))
})
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
# Plot the guessing-baseline ROC points.
# qplot() is deprecated in current ggplot2; use ggplot() directly
# (qplot with two continuous variables defaults to a scatter plot).
guessing %>%
  ggplot(aes(FPR, TPR)) +
  geom_point() +
  xlab("1 - Specificity") +
  ylab("Sensitivity")

Decision tree

library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## 
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
## 
##     boundary
# Restrict to riders with known gender (0 = unknown).
# NOTE: this overwrites bbikemember for the remainder of the session.
bbikemember <- bbikemember %>%
        filter(gender != 0)
# Conditional-inference tree predicting overtime from gender and the
# weekend flag.
# NOTE(review): satsun is already a factor (see the bbikemember build),
# so factor(satsun) is redundant but harmless.
output_tree <- ctree(overtime ~ gender + factor(satsun),
                     data = bbikemember)
plot(output_tree)